# import jieba.posseg as psg
# import jieba
#
# words="就保持无感，每天才会有幸福感！"
# word=jieba.lcut(words)
# print(word)
# cixin=psg.lcut(words)
# print(cixin)
#
import re
import requests


def get_reply(url):
    """Fetch the raw HTML of *url*, pretending to be a desktop browser.

    Parameters
    ----------
    url : str
        The Tieba thread page URL to download.

    Returns
    -------
    str
        The page's HTML source.

    Raises
    ------
    requests.RequestException
        On network failure, timeout, or a 4xx/5xx HTTP status.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    # A timeout keeps the script from hanging forever on a dead connection.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Fail loudly on an HTTP error instead of silently regex-parsing an error page.
    response.raise_for_status()
    return response.text

def parse_reply(html):
    """Extract post authors, bodies, and timestamps from one Tieba thread
    page and append them as CSV rows to 作业.csv.

    Parameters
    ----------
    html : str
        Raw HTML source of a single thread page.

    Returns
    -------
    list[str]
        The page-count strings found in the HTML (typically one element),
        so the caller can decide how many more pages to fetch.
    """
    import csv

    # Poster names.
    authors = re.findall(r'<a.*?class="p_author_name.*?.*?>(.*?)</a>', html)
    # Post bodies.
    contents = re.findall(r'<div.*?class="d_post_content j_d_post_content ".*?>(.*?)</div>', html)
    # Post timestamps.
    times = re.findall(r'<span class="tail-info">(.*?)</span>', html)
    # Total number of pages in the thread.
    page_numbers = re.findall(r'共<span class="red">(.*?)</span>', html)

    # newline='' is required by the csv module so Windows doesn't get blank rows.
    with open("作业.csv", 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        # Write the header only once: f.tell() is 0 iff the file is new/empty.
        # The original wrote a header on every call, duplicating it per page.
        if f.tell() == 0:
            writer.writerow(["发帖内容", "发帖人", "发帖时间"])
        # zip() stops at the shortest list, so a partially matched page can no
        # longer raise IndexError; csv.writer also escapes embedded quotes/commas.
        for content, author, post_time in zip(contents, authors, times):
            writer.writerow([content, author, post_time])
    return page_numbers

def main():
    """Download the requested number of pages of one Tieba thread and
    append every post to 作业.csv via parse_reply."""
    url = "https://tieba.baidu.com/p/7255078874"
    # The first fetch both saves page 1's posts and yields the total page count.
    html = get_reply(url)
    page_numbers = parse_reply(html)
    # page_numbers is a list like ['5']; print the value, not the list repr.
    total = page_numbers[0] if page_numbers else "?"
    print("该帖子一共" + str(total) + "页")
    x = input("请输入你想要爬取的页数（不得超过贴吧总页数）： ")
    # Page 1 was already saved above, so start at page 2 — the original
    # re-fetched pn=1 and wrote every page-1 post to the CSV twice.
    for page in range(2, int(x) + 1):
        html = get_reply(url + "?pn=" + str(page))
        parse_reply(html)

# Entry point: run the scraper only when executed as a script, not on import.
if __name__ == "__main__":
    main()
    print("ok")